library(tidyverse)
## -- Attaching packages ------------------------------------------------- tidyverse 1.3.0 --
## <U+2713> ggplot2 3.2.1     <U+2713> purrr   0.3.3
## <U+2713> tibble  2.1.3     <U+2713> dplyr   0.8.3
## <U+2713> tidyr   1.0.0     <U+2713> stringr 1.4.0
## <U+2713> readr   1.3.1     <U+2713> forcats 0.4.0
## -- Conflicts ---------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
## 
##     set_names
## The following object is masked from 'package:tidyr':
## 
##     extract
drivers <- feather::read_feather("drivers.feather")
incidents <- feather::read_feather("incidents.feather")
non_motorists <- feather::read_feather("non_motorists.feather")
combo <- feather::read_feather("combo.feather")
incident_locations <- incidents %>%
  filter(!not_in_county) %>%
  select(longitude, latitude)
kmeans_list <- lapply(1:15, function(x) kmeans(incident_locations, centers = x, nstart = 25, iter.max = 100))
## Warning: Quick-TRANSfer stage steps exceeded maximum (= 2900950)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 2900950)

## Warning: Quick-TRANSfer stage steps exceeded maximum (= 2900950)
centroidss <- data.frame(
  centroids = 1:15,
  sum_of_squares = sapply(1:15, function(x) kmeans_list[[x]]$tot.withinss)
)
centroidss %>%
  ggplot(aes(x = centroids, y = sum_of_squares)) +
  geom_line()

centroidss %>% knitr::kable()
centroids sum_of_squares
1 819.73958
2 310.00688
3 216.66009
4 174.94499
5 142.10053
6 119.37358
7 101.20404
8 88.70035
9 76.81700
10 68.04903
11 62.61545
12 56.46295
13 52.08254
14 48.25165
15 45.18553
incident_clusters <- incident_locations
for (x in 1:15) {
  incident_clusters$cluster <-
    as.factor(paste0("cluster", kmeans_list[[x]]$cluster))
  temp <- incident_clusters %>%
    ggplot(aes(x = longitude, y = latitude, color = cluster)) +
    geom_point()
  temp %>% print()
}

contingency_table <- function(data, var1, var2) {
  data %>%
    group_by(!!enquo(var1), !!enquo(var2)) %>%
    tally() %>%
    ggplot() +
    geom_bin2d(aes(x = !!enquo(var1), fill = log(n), y = !!enquo(var2))) +
    geom_text(aes(x = !!enquo(var1), label = n, y = !!enquo(var2))) +
    scale_fill_gradient(
      low = "#ece7f2",
      high = "#a6bddb"
    ) +
    theme(
      panel.grid = element_blank(),
      legend.position = "none",
      panel.background = element_rect(fill = "white")
    )
}
temp <- chisq.test(as.factor(incidents$weather), as.factor(incidents$acrs_report_type))
## Warning in chisq.test(as.factor(incidents$weather),
## as.factor(incidents$acrs_report_type)): Chi-squared approximation may be
## incorrect
temp %>%
  broom::tidy() %>%
  knitr::kable()
statistic p.value parameter method
172.3132 0 24 Pearson’s Chi-squared test

p-value 2.129059810^{-24}

contingency_table(incidents, acrs_report_type, weather)

Helmets

helmet_data <- non_motorists %>%
  filter(as.logical(pedestrian_type_bicyclist)) %>%
  select(safety_equipment, injury_severity) %>%
  mutate(safety_equipment = c("no helmet", "helmet")[1 + as.integer(str_detect(tolower(safety_equipment), "helmet"))])

temp <- chisq.test(as.factor(helmet_data$safety_equipment), as.factor(helmet_data$injury_severity))
## Warning in chisq.test(as.factor(helmet_data$safety_equipment),
## as.factor(helmet_data$injury_severity)): Chi-squared approximation may be
## incorrect
temp %>%
  broom::tidy() %>%
  knitr::kable()
statistic p.value parameter method
7.961767 0.0929888 4 Pearson’s Chi-squared test
contingency_table(helmet_data, safety_equipment, injury_severity)

Speed Limits

ordered_injury <- drivers %>%
  count(injury_severity) %>%
  arrange(desc(n)) %>%
  pull(injury_severity)

speed_limit_data <- drivers %>%
  select(speed_limit, injury_severity) %>%
  filter(speed_limit > 20 & speed_limit < 60) %>%
  mutate(
    `Speed Limit` = factor(speed_limit),
    `Injury Severity` = factor(injury_severity, ordered_injury)
  )

temp <- chisq.test(as.factor(speed_limit_data$speed_limit), as.factor(speed_limit_data$injury_severity))
## Warning in chisq.test(as.factor(speed_limit_data$speed_limit),
## as.factor(speed_limit_data$injury_severity)): Chi-squared approximation may be
## incorrect
temp %>%
  broom::tidy() %>%
  knitr::kable()
statistic p.value parameter method
669.7539 0 24 Pearson’s Chi-squared test

p-value 5.647136710^{-126}

contingency_table(speed_limit_data, `Speed Limit`, `Injury Severity`)

ordered_injury <- drivers %>%
  count(injury_severity) %>%
  arrange(desc(n)) %>%
  pull(injury_severity)

speed_limit_data <- drivers %>%
  select(speed_limit, injury_severity) %>%
  mutate(
    speed_limit = factor(c("Under 35", "35 - 45", "50+")[1 + findInterval(speed_limit, c(31, 46))], levels = c("Under 35", "35 - 45", "50+")),
    injury_severity = factor(injury_severity, ordered_injury)
  )

temp <- chisq.test(as.factor(speed_limit_data$speed_limit), as.factor(speed_limit_data$injury_severity))
## Warning in chisq.test(as.factor(speed_limit_data$speed_limit),
## as.factor(speed_limit_data$injury_severity)): Chi-squared approximation may be
## incorrect
temp %>%
  broom::tidy() %>%
  knitr::kable()
statistic p.value parameter method
1000.369 0 8 Pearson’s Chi-squared test

p-value 1.242907610^{-210}

contingency_table(speed_limit_data, speed_limit, injury_severity)

temp <- combo %>%
  filter(
    road_division %in% c(
      "TWO-WAY, NOT DIVIDED WITH A CONTINUOUS LEFT TURN",
      "ONE-WAY TRAFFICWAY", "TWO-WAY, DIVIDED",
      "UNPROTECTED PAINTED MIN 4 FEET",
      "TWO-WAY, DIVIDED, POSITIVE MEDIAN BARRIER",
      "TWO-WAY, NOT DIVIDED"
    )
  ) %>%
  mutate(
    severity_index =
      d_vehicle_damage_extent_functional +
        2 * d_vehicle_damage_extent_superficial +
        4 * d_vehicle_damage_extent_disabling +
        6 * d_vehicle_damage_extent_destroyed +
        10 * d_injury_severity_fatal_injury +
        8 * d_injury_severity_suspected_serious_injury +
        4 * d_injury_severity_suspected_minor_injury +
        2 * d_injury_severity_possible_injury +
        10 * nm_injury_severity_fatal_injury +
        8 * nm_injury_severity_suspected_serious_injury +
        4 * nm_injury_severity_suspected_minor_injury +
        2 * nm_injury_severity_possible_injury,
    road_division = as.factor(road_division)
  )
division_test_anova <- aov(severity_index ~ road_division, data = temp)
broom::tidy(division_test_anova) %>% rmarkdown::paged_table()
#division_test_kruskal <- kruskal.test(severity_index ~ road_division, data = temp)
plot(division_test_anova, 1)

ggplot(temp) + geom_density(aes(x = severity_index))

qqnorm(temp$severity_index)

plot(division_test_anova, 2)

division_test_oneway <- oneway.test(severity_index ~ road_division, data = temp)
broom::tidy(division_test_oneway) %>% rmarkdown::paged_table()
## Multiple parameters; naming those columns num.df, denom.df
division_test_pairwiset <- pairwise.t.test(temp$severity_index, temp$road_division, p.adjust.method = "BH", pool.sd = FALSE)
broom::tidy(division_test_pairwiset) %>% rmarkdown::paged_table()